# Load the heart-disease data set.
# fileEncoding = "UTF-8-BOM" strips a leading byte-order mark, so the first
# header is read as "age" on every platform instead of a locale-dependent
# mangled name such as "ï..age".
data <- read.csv(file = "heart.csv", fileEncoding = "UTF-8-BOM")
# Pre processing
# Keep "age" as the last column; this fails loudly if the column is missing.
data <- data[, c(setdiff(names(data), "age"), "age")]
names(data)
## [1] "sex" "cp" "trestbps" "chol" "fbs" "restecg"
## [7] "thalach" "exang" "oldpeak" "slope" "ca" "thal"
## [13] "target" "age"
# Keep an untouched numeric snapshot of the data; several columns of `data`
# are replaced by text labels below.
copy <- data.frame(data)
numerical_attributes <- c("age", "trestbps", "chol", "oldpeak", "thalach")
numerical_attributes
## [1] "age" "trestbps" "chol" "oldpeak" "thalach"

# age: below 50 -> "young", everything else -> "old"
data$age <- replace(rep("old", nrow(data)), which(data$age < 50), "young")
as.data.frame(table(data$age))

# resting blood pressure: below 120 -> normal, everything else -> high
data$trestbps <- replace(
  rep("high blood pressure", nrow(data)),
  which(data$trestbps < 120),
  "normal blood pressure"
)
as.data.frame(table(data$trestbps))

# cholesterol: below 200 -> normal, everything else -> high
data$chol <- replace(
  rep("high chol", nrow(data)),
  which(data$chol < 200),
  "normal chol"
)
as.data.frame(table(data$chol))
# Label one row's heart rate and store the label in the global `data`.
#
# index: row number to label.
# low, high: bounds of the normal range; values strictly below `low` are
#   "low HR", values strictly above `high` are "high HR", the rest
#   (bounds included) are "normal HR".
#
# Reads the numeric value from the global `copy` and writes the label into
# the global `data$thalach` via `<<-`.
assign_value <- function(index, low, high) {
  heart_rate <- copy$thalach[index]
  label <- if (heart_rate < low) {
    "low HR"
  } else if (heart_rate > high) {
    "high HR"
  } else {
    "normal HR"
  }
  data$thalach[index] <<- label
}
# Label each row's heart rate (thalach) as low / normal / high relative to an
# age-dependent reference range.
#
# Age bands are left-closed: below 20, [20, 30), [30, 35), ..., [60, 65),
# 65 and above. Band k uses hr_low[k] and hr_high[k] as the normal range.
# Values strictly below the lower bound are "low HR", values strictly above
# the upper bound are "high HR", everything else is "normal HR".
#
# Ages and heart rates are read from the numeric snapshot `copy`, because
# `data$age` already holds text labels at this point. The lookup is
# vectorized, so it also works on an empty data frame, and a missing age or
# heart rate yields NA rather than an error.
hr_age_breaks <- c(20, 30, 35, 40, 45, 50, 55, 60, 65)
hr_low <- c(100, 95, 93, 90, 88, 85, 83, 80, 78, 75)
hr_high <- c(170, 162, 157, 153, 149, 145, 140, 136, 132, 128)
# findInterval() returns 0 for ages below the first break, hence the + 1L.
hr_band <- findInterval(copy$age, hr_age_breaks) + 1L
data$thalach <- ifelse(
  copy$thalach < hr_low[hr_band],
  "low HR",
  ifelse(copy$thalach > hr_high[hr_band], "high HR", "normal HR")
)
as.data.frame(table(data$thalach))
# Discretize oldpeak into three bins, split at the values found one third and
# two thirds of the way through the sorted observations.
#
# The comparisons run on a numeric snapshot of the column. Writing a text
# label into a single element of data$oldpeak converts the whole column to
# character, after which `<` would compare strings instead of numbers.
oldpeak_values <- data$oldpeak
sorted_values <- sort(oldpeak_values)
# Integer division gives the same position as truncating the fractional index.
v1 <- sorted_values[length(sorted_values) %/% 3]
v2 <- sorted_values[(2 * length(sorted_values)) %/% 3]
oldpeak_low <- paste0("oldpeak [", min(sorted_values), ",", v1, ")")
oldpeak_mid <- paste0("oldpeak [", v1, ",", v2, ")")
oldpeak_high <- paste0("oldpeak [", v2, ",", max(sorted_values), "]")
# Vectorized over every row, so rows are not skipped when sort() drops NAs.
data$oldpeak <- ifelse(
  oldpeak_values < v1,
  oldpeak_low,
  ifelse(oldpeak_values < v2, oldpeak_mid, oldpeak_high)
)
# Prefix every categorical code with the name of its column
# (e.g. an fbs value of 0 becomes "fbs0").
categorical_columns <- c(
  "fbs", "restecg", "slope", "thal", "ca", "target", "exang", "sex", "cp"
)
for (column_name in categorical_columns) {
  data[[column_name]] <- paste0(column_name, data[[column_name]])
}
# Show the fully discretized data frame.
data
# arules supplies transactions(), apriori(), itemLabels() and DATAFRAME() as
# used below; arulesViz presumably supplies the plot() methods for rules
# (TODO confirm).
library(arules)
library(arulesViz)
# Convert the discretized data frame into an arules transactions object.
# The later grep on "^target=" relies on item labels of the form
# "<column>=<value>".
trans <- arules::transactions(data)
# Auto-print a summary of the transactions object.
trans
## transactions in sparse format with
## 303 transactions (rows) and
## 39 items (columns)
We can see that the majority of the individuals (>80%) don’t have diabetes (fbs=0), but do have a high cholesterol level and high blood pressure. Also, about 70% of them are old (aged 50 or over).
# Plot the 20 most frequent items.
itemFrequencyPlot(trans, topN = 20)

# Count the rules found for every combination of minimum support and minimum
# confidence. Each row of rules_descr holds the setting and the printed
# summary of the mined rule set.
support_grid <- c(0.1, 0.2, 0.3, 0.4, 0.5)
confidence_grid <- c(0.7, 0.8, 0.9, 1.0)
rules_descr <- NULL
for (i in support_grid) {
  for (j in confidence_grid) {
    rules <- apriori(
      trans,
      supp = i, conf = j, target = "rules",
      parameter = list(minlen = 2)
    )
    setting <- paste0("Supp = ", i, " Conf = ", j)
    printed_summary <- capture.output(print(rules))
    rules_descr <- rbind(rules_descr, c(setting, printed_summary))
  }
}
rules_descr
## [,1] [,2]
## [1,] "Supp = 0.1 Conf = 0.7" "set of 22990 rules "
## [2,] "Supp = 0.1 Conf = 0.8" "set of 16761 rules "
## [3,] "Supp = 0.1 Conf = 0.9" "set of 4473 rules "
## [4,] "Supp = 0.1 Conf = 1" "set of 136 rules "
## [5,] "Supp = 0.2 Conf = 0.7" "set of 2769 rules "
## [6,] "Supp = 0.2 Conf = 0.8" "set of 1984 rules "
## [7,] "Supp = 0.2 Conf = 0.9" "set of 244 rules "
## [8,] "Supp = 0.2 Conf = 1" "set of 0 rules "
## [9,] "Supp = 0.3 Conf = 0.7" "set of 581 rules "
## [10,] "Supp = 0.3 Conf = 0.8" "set of 412 rules "
## [11,] "Supp = 0.3 Conf = 0.9" "set of 18 rules "
## [12,] "Supp = 0.3 Conf = 1" "set of 0 rules "
## [13,] "Supp = 0.4 Conf = 0.7" "set of 117 rules "
## [14,] "Supp = 0.4 Conf = 0.8" "set of 76 rules "
## [15,] "Supp = 0.4 Conf = 0.9" "set of 1 rules "
## [16,] "Supp = 0.4 Conf = 1" "set of 0 rules "
## [17,] "Supp = 0.5 Conf = 0.7" "set of 42 rules "
## [18,] "Supp = 0.5 Conf = 0.8" "set of 28 rules "
## [19,] "Supp = 0.5 Conf = 0.9" "set of 0 rules "
## [20,] "Supp = 0.5 Conf = 1" "set of 0 rules "
# Rules with support >= 0.5 and confidence >= 0.8, at least two items each.
# Mined from `trans`, the transactions object the parameter grid search was
# run on, instead of re-coercing the raw data frame on every call.
rules <- apriori(
  trans,
  supp = 0.5, conf = 0.8, target = "rules",
  parameter = list(minlen = 2)
)
# List the rules, highest confidence first.
result <- DATAFRAME(rules, separate = FALSE)
result[order(-result$confidence), ]
plot(rules, engine = "html")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Graph view of the rule set currently held in `rules`.
plot(rules, method = "graph", engine = "html")

# Rules with support >= 0.4 and confidence >= 0.9, listed by confidence.
# Mined from `trans`, the transactions object the parameter grid search was
# run on, instead of re-coercing the raw data frame on every call.
rules <- apriori(
  trans,
  supp = 0.4, conf = 0.9, target = "rules",
  parameter = list(minlen = 2)
)
result <- DATAFRAME(rules, separate = FALSE)
result[order(-result$confidence), ]

# Rules with support >= 0.3 and confidence >= 0.9, shown as a plot.
rules <- apriori(
  trans,
  supp = 0.3, conf = 0.9, target = "rules",
  parameter = list(minlen = 2)
)
plot(rules, engine = "html")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
We can see that 10% of the people are old and have high cholesterol and high blood pressure but no chest pain (cp=0); with 100% confidence, these people don’t have any heart problems.
# Rules with support >= 0.1 that hold with 100% confidence.
# Mined from `trans`, the transactions object the parameter grid search was
# run on, instead of re-coercing the raw data frame on every call.
rules <- apriori(
  trans,
  supp = 0.1, conf = 1, target = "rules",
  parameter = list(minlen = 2)
)
plot(rules, engine = "html")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Restrict the right-hand side of the rules to the target items, i.e. the
# items whose label starts with "target=".
targetItems <- Filter(
  function(label) startsWith(label, "target="),
  itemLabels(trans)
)
rules <- apriori(
  trans,
  supp = 0.25, conf = 0.9, target = "rules",
  parameter = list(minlen = 2),
  appearance = list(rhs = targetItems)
)
# List the rules, highest confidence first, then draw them as a graph.
result <- DATAFRAME(rules, separate = FALSE)
result[order(-result[["confidence"]]), ]
plot(rules, method = "graph", engine = "html")
# Same right-hand-side restriction (targetItems), with a higher minimum
# support (0.4) and a lower minimum confidence (0.7).
rules <- apriori(
  trans,
  supp = 0.4, conf = 0.7, target = "rules",
  parameter = list(minlen = 2),
  appearance = list(rhs = targetItems)
)
# List the rules, highest confidence first, then draw them as a graph.
result <- DATAFRAME(rules, separate = FALSE)
result[order(-result[["confidence"]]), ]
plot(rules, method = "graph", engine = "html")
# Narrow the right-hand side to the items whose label starts with
# "target=target0".
targetItems <- Filter(
  function(label) startsWith(label, "target=target0"),
  itemLabels(trans)
)
rules <- apriori(
  trans,
  supp = 0.3, conf = 0.7, target = "rules",
  parameter = list(minlen = 2),
  appearance = list(rhs = targetItems)
)
# List the rules, highest confidence first.
result <- DATAFRAME(rules, separate = FALSE)
result[order(-result[["confidence"]]), ]